Lab11_PM566

Lab 11 - Interactive Visualization

November 10, 2023

Grab lab file using command line:

# Step 1
cd ~/Documents
mkdir lab11
cd lab11

# Step 2
wget https://raw.githubusercontent.com/USCbiostats/PM566/master/website/content/assignment/11-lab.Rmd

And remember to set eval=TRUE

Learning Goals

  • Read in and process the COVID dataset from the New York Times GitHub repository

  • Create interactive graphs of different types using plot_ly() and ggplotly() functions

  • Customize the hoverinfo and other plot features

  • Create a Choropleth map using plot_geo()

Lab Description

We will work with COVID data downloaded from the New York Times. The dataset consists of COVID-19 cases and deaths in each US state during the course of the COVID epidemic.

The objective of this lab is to explore relationships between cases, deaths, and population sizes of US states, and to plot the data to demonstrate this.

Steps

I. Reading and processing the New York Times (NYT) state-level COVID-19 data

1. Read in the data

## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data

## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
library(data.table)
library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(magrittr)
library(tidyr)

Attaching package: 'tidyr'
The following object is masked from 'package:magrittr':

    extract
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT: one row per state per day with
# cumulative `cases` and `deaths` (see head() output below)
cv_states <- as.data.frame(fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))

### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
# In the census file, `state` holds the two-letter code (ends up in `abb`,
# e.g. "AL") and `state_name` the full name; rename so that `state` carries
# the full name and matches the NYT `state` column used as the merge key.
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL

### FINISH THE CODE HERE
# Inner merge (merge() default all = FALSE) on the full state name attaches
# population, pop_density, geo_id, and abb to every daily row.
cv_states <- merge(cv_states, state_pops, by= "state")
head(cv_states)
    state       date fips   cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04    1 1587224  21263      1    4887871    96.50939  AL
2 Alabama 2020-04-25    1    6213    213      1    4887871    96.50939  AL
3 Alabama 2023-02-26    1 1638348  21400      1    4887871    96.50939  AL
4 Alabama 2022-12-03    1 1549285  21129      1    4887871    96.50939  AL
5 Alabama 2020-05-06    1    8691    343      1    4887871    96.50939  AL
6 Alabama 2021-04-21    1  524367  10807      1    4887871    96.50939  AL

2. Look at the data

  • Inspect the dimensions, head, and tail of the data

  • Inspect the structure of each variable. Are they in the correct format?

dim(cv_states)
[1] 58094     9
head(cv_states)
    state       date fips   cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04    1 1587224  21263      1    4887871    96.50939  AL
2 Alabama 2020-04-25    1    6213    213      1    4887871    96.50939  AL
3 Alabama 2023-02-26    1 1638348  21400      1    4887871    96.50939  AL
4 Alabama 2022-12-03    1 1549285  21129      1    4887871    96.50939  AL
5 Alabama 2020-05-06    1    8691    343      1    4887871    96.50939  AL
6 Alabama 2021-04-21    1  524367  10807      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11   56 175290   1884     56     577737    5.950611  WY
58090 Wyoming 2022-08-21   56 173487   1871     56     577737    5.950611  WY
58091 Wyoming 2021-01-26   56  51152    596     56     577737    5.950611  WY
58092 Wyoming 2021-02-21   56  53795    662     56     577737    5.950611  WY
58093 Wyoming 2021-08-22   56  70671    809     56     577737    5.950611  WY
58094 Wyoming 2021-03-20   56  55581    693     56     577737    5.950611  WY
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ date       : IDate, format: "2023-01-04" "2020-04-25" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
 $ deaths     : int  21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : chr  "AL" "AL" "AL" "AL" ...

3. Format the data

  • Make date into a date variable

  • Make state into a factor variable

  • Order the data first by state, second by date

  • Confirm the variables are now correctly formatted

  • Inspect the range values for each variable. What is the date range? The range of cases and deaths?

# Parse the date column (IDate from fread) into base Date objects
cv_states$date <- as.Date(cv_states$date, format = "%Y-%m-%d")

# Convert state and abbreviation to factors, with levels in order of first
# appearance; keep the level vectors around for the per-state loops below.
state_list <- unique(cv_states$state)
abb_list <- unique(cv_states$abb)
cv_states$state <- factor(cv_states$state, levels = state_list)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)

### FINISH THE CODE HERE 
# Sort the rows by state first, then chronologically within each state
cv_states <- cv_states[order(cv_states$state, cv_states$date), ]


# Confirm the variables are now correctly formatted
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date       : Date, format: "2020-03-13" "2020-03-14" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
 $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
       state       date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
597  Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
282  Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
12   Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
266  Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
78   Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
57902 Wyoming 2023-03-18   56 185640   2009     56     577737    5.950611  WY
57916 Wyoming 2023-03-19   56 185640   2009     56     577737    5.950611  WY
57647 Wyoming 2023-03-20   56 185640   2009     56     577737    5.950611  WY
57867 Wyoming 2023-03-21   56 185800   2014     56     577737    5.950611  WY
58057 Wyoming 2023-03-22   56 185800   2014     56     577737    5.950611  WY
57812 Wyoming 2023-03-23   56 185800   2014     56     577737    5.950611  WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
       state       date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
597  Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
282  Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
12   Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
266  Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
78   Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
summary(cv_states)
           state            date                 fips           cases         
 Washington   : 1158   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
 Illinois     : 1155   1st Qu.:2020-12-06   1st Qu.:16.00   1st Qu.:  112125  
 California   : 1154   Median :2021-09-11   Median :29.00   Median :  418120  
 Arizona      : 1153   Mean   :2021-09-10   Mean   :29.78   Mean   :  947941  
 Massachusetts: 1147   3rd Qu.:2022-06-17   3rd Qu.:44.00   3rd Qu.: 1134318  
 Wisconsin    : 1143   Max.   :2023-03-23   Max.   :72.00   Max.   :12169158  
 (Other)      :51184                                                          
     deaths           geo_id        population        pop_density       
 Min.   :     0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
 1st Qu.:  1598   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
 Median :  5901   Median :29.00   Median : 4468402   Median :  107.860  
 Mean   : 12553   Mean   :29.78   Mean   : 6397965   Mean   :  423.031  
 3rd Qu.: 15952   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
 Max.   :104277   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
                                                     NA's   :1106       
      abb       
 WA     : 1158  
 IL     : 1155  
 CA     : 1154  
 AZ     : 1153  
 MA     : 1147  
 WI     : 1143  
 (Other):51184  
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"

4. Add new_cases and new_deaths and correct outliers

  • Add variables for new cases, new_cases, and new deaths, new_deaths:

    • Hint: You can set new_cases equal to the difference between cases on date i and date i-1, starting on date i=2
  • Filter to dates after June 1, 2021

  • Use plotly for EDA: See if there are outliers or values that don’t make sense for new_cases and new_deaths. Which states and which dates have strange values? California 2021-06-05

  • Correct outliers: Set negative values for new_cases or new_deaths to 0

  • Recalculate cases and deaths as cumulative sum of updated new_cases and new_deaths

  • Get the rolling average of new cases and new deaths to smooth over time

  • Inspect data again interactively

# Add variables for new_cases and new_deaths:
# daily increments of the cumulative series, computed per state.
for (i in seq_along(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  cv_subset = cv_subset[order(cv_subset$date),]

  ### FINISH THE CODE HERE
  # The first row keeps the cumulative total as its starting level; every
  # later row is the day-over-day difference. diff() replaces the element-wise
  # loop and avoids the `2:nrow()` backwards-sequence bug when a state has
  # only one row.
  cv_subset$new_cases = c(cv_subset$cases[1], diff(cv_subset$cases))
  cv_subset$new_deaths = c(cv_subset$deaths[1], diff(cv_subset$deaths))

  # include in main dataset (cv_states is already ordered by state then date,
  # so positions line up with the date-sorted subset)
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}

# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")

### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly. The fill-in is geom_line(),
# so each state's daily series is drawn as a line with points on top;
# hovering identifies the state/date of any spike or negative dip.
p1 <- ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1 <- NULL # to clear from workspace

p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2 <- NULL # to clear from workspace

# Correct outliers: negative daily counts arise when cumulative totals are
# revised downward (likely reporting corrections), so clamp them to 0.
cv_states$new_cases[cv_states$new_cases < 0] = 0
cv_states$new_deaths[cv_states$new_deaths < 0] = 0

# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in seq_along(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])

  # Keep the first row's cumulative level as the starting point, then
  # accumulate the cleaned daily increments from row 2 onward.
  start_cases = cv_subset$cases[1]
  start_deaths = cv_subset$deaths[1]

  ### FINISH CODE HERE
  # BUG FIX: the original inner loop added `new_deaths[j]` TWICE when
  # rebuilding `deaths` (deaths[j] = new_deaths[j] + new_deaths[j] +
  # deaths[j-1]), inflating every state's cumulative death counts.
  # cumsum() over (starting level, increments 2..n) gives the intended
  # series and also handles a single-row state safely.
  cv_subset$cases = cumsum(c(start_cases, cv_subset$new_cases[-1]))
  cv_subset$deaths = cumsum(c(start_deaths, cv_subset$new_deaths[-1]))

  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}

# Smooth new counts with a 7-day right-aligned rolling mean, computed WITHIN
# each state. Applying rollmean to the stacked all-states vector (as before)
# lets the window straddle state boundaries, so the first six values of every
# state after the first were contaminated by the previous state's tail.
roll7 <- function(v) round(zoo::rollmean(v, k = 7, fill = NA, align = "right"), digits = 0)
cv_states$new_cases <- ave(cv_states$new_cases, cv_states$state, FUN = roll7)
cv_states$new_deaths <- ave(cv_states$new_deaths, cv_states$state, FUN = roll7)

# Inspect the smoothed new_deaths series interactively, one colored line
# (plus small points) per state
p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_line() +
  geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
#p2=NULL

5. Add additional variables

  • Add population-normalized (by 100,000) variables for each variable type (rounded to 1 decimal place). Make sure the variables you calculate are in the correct format (numeric). You can use the following variable names:

    • per100k = cases per 100,000 population

    • newper100k= new cases per 100,000

    • deathsper100k = deaths per 100,000

    • newdeathsper100k = new deaths per 100,000

  • Add a “naive CFR” variable representing deaths / cases on each date for each state

  • Create a dataframe representing values on the most recent date, cv_states_today, as done in lecture

### FINISH CODE HERE

# Add population normalized (by 100,000) counts for each variable.
# round() already returns numerics at 1 decimal place; the original
# as.numeric(format(round(...), nsmall = 1)) round-tripped the values through
# character strings, which is what produced the "NAs introduced by coercion"
# warnings on the columns containing NA (the rolling-average lead-in rows).
cv_states$per100k = round(cv_states$cases / (cv_states$population/100000), 1)
cv_states$newper100k = round(cv_states$new_cases / (cv_states$population/100000), 1)
cv_states$deathsper100k = round(cv_states$deaths / (cv_states$population/100000), 1)
cv_states$newdeathsper100k = round(cv_states$new_deaths / (cv_states$population/100000), 1)

# Add a naive CFR variable = deaths / cases on each date, as a percentage
cv_states = cv_states %>% mutate(naive_CFR = round((deaths * 100 / cases), 2))

# Create a `cv_states_today` data frame holding only the most recent date
cv_states_today = subset(cv_states, date == max(cv_states$date))

II. Scatterplots

6. Explore scatterplots using plot_ly()

  • Create a scatterplot using plot_ly() representing pop_density vs. various variables (e.g. cases, per100k, deaths, deathsper100k) for each state on most recent date (cv_states_today)

    • Color points by state and size points by state population

    • Use hover to identify any outliers.

    • Remove those outliers and replot.

  • Choose one plot. For this plot:

    • Add hoverinfo specifying the state name, cases per 100k, and deaths per 100k, similarly to how we did this in the lecture notes

    • Add layout information to title the chart and the axes

    • Enable hovermode = "compare"

    ### FINISH CODE HERE
    
    cv_states_today
                         state       date fips    cases deaths geo_id population
    661                Alabama 2023-03-23    1  1648385  32106      1    4887871
    1322                Alaska 2023-03-23    2   308893   2526      2     737438
    1983               Arizona 2023-03-23    4  2451062  48814      4    7171646
    2644              Arkansas 2023-03-23    5  1008303  20306      5    3013825
    3305            California 2023-03-23    6 12170765 147230      6   39557045
    3966              Colorado 2023-03-23    8  1781053  22453      8    5695564
    4627           Connecticut 2023-03-23    9   978456  16376      9    3572665
    5288              Delaware 2023-03-23   10   334147   5043     10     967171
    5949  District of Columbia 2023-03-23   11   178238   1902     11     702455
    6610               Florida 2023-03-23   12  7583396 137442     12   21299325
    7271               Georgia 2023-03-23   13  2985191  61929     13   10519475
    7932                Hawaii 2023-03-23   15   372408   3205     15    1420491
    8593                 Idaho 2023-03-23   16   522919   8830     16    1754208
    9254              Illinois 2023-03-23   17  4107931  58009     17   12741080
    9915               Indiana 2023-03-23   18  2058560  38732     18    6691878
    10576                 Iowa 2023-03-23   19   907421  15495     19    3156145
    11237               Kansas 2023-03-23   20   940907  15515     20    2911505
    11898             Kentucky 2023-03-23   21  1739015  29587     21    4468402
    12559            Louisiana 2023-03-23   22  1580709  27120     22    4659978
    13220                Maine 2023-03-23   23   320004   5143     23    1338404
    13881             Maryland 2023-03-23   24  1369957  23768     24    6042718
    14542        Massachusetts 2023-03-23   25  2230150  38539     25    6902149
    15203             Michigan 2023-03-23   26  3068195  64254     26    9995915
    15864            Minnesota 2023-03-23   27  1784263  22408     27    5611179
    16525          Mississippi 2023-03-23   28   993035  19592     28    2986530
    17186             Missouri 2023-03-23   29  1794222  38480     29    6126452
    17847              Montana 2023-03-23   30   330930   5795     30    1062305
    18508             Nebraska 2023-03-23   31   576365   7867     31    1929268
    19169               Nevada 2023-03-23   32   893067  18622     32    3034392
    19830        New Hampshire 2023-03-23   33   379689   4683     33    1356458
    20491           New Jersey 2023-03-23   34  3057442  45981     34    8908520
    21152           New Mexico 2023-03-23   35   673541  13986     35    2095428
    21813             New York 2023-03-23   36  6805271 107832     36   19542209
    22474       North Carolina 2023-03-23   37  3481732  46380     37   10383620
    23135         North Dakota 2023-03-23   38   288106   3525     38     760077
    23796                 Ohio 2023-03-23   39  3415254  64261     39   11689442
    24457             Oklahoma 2023-03-23   40  1295832  25813     40    3943079
    25118               Oregon 2023-03-23   41   967157  16199     41    4190713
    25779         Pennsylvania 2023-03-23   42  3543532  74216     42   12807060
    26440          Puerto Rico 2023-03-23   72  1139345   9191     72    3195153
    27101         Rhode Island 2023-03-23   44   461807   5134     44    1057315
    27762       South Carolina 2023-03-23   45  1840458  30682     45    5084127
    28423         South Dakota 2023-03-23   46   280525   4432     46     882235
    29084            Tennessee 2023-03-23   47  2477417  45732     47    6770010
    29745                Texas 2023-03-23   48  8458361 137519     48   28701845
    30406                 Utah 2023-03-23   49  1093077   8332     49    3161105
    31067              Vermont 2023-03-23   50   153198   1623     50     626299
    31728             Virginia 2023-03-23   51  2299096  36398     51    8517685
    32389           Washington 2023-03-23   53  1944818  26275     53    7535591
    33050        West Virginia 2023-03-23   54   645710  13715     54    1805832
    33711            Wisconsin 2023-03-23   55  2014524  25127     55    5813568
    34372              Wyoming 2023-03-23   56   186424   3308     56     577737
           pop_density abb new_cases new_deaths per100k newper100k deathsper100k
    661      96.509389  AL       280          3 33724.0        5.7         656.9
    1322      1.291523  AK        71          0 41887.3        9.6         342.5
    1983     63.135855  AZ       379          5 34177.1        5.3         680.7
    2644     57.919684  AR        52          1 33455.9        1.7         673.8
    3305    253.906502  CA      2433         23 30767.6        6.2         372.2
    3966     54.955978  CO       329          5 31270.9        5.8         394.2
    4627    737.744600  CT       126          2 27387.3        3.5         458.4
    5288    496.432460  DE        48          1 34548.9        5.0         521.4
    5949  11490.119540  DC        17          0 25373.6        2.4         270.8
    6610    397.015754  FL      1688         42 35603.9        7.9         645.3
    7271    182.264789  GA       398          5 28377.8        3.8         588.7
    7932    221.176916  HI        72          1 26216.9        5.1         225.6
    8593     21.225799  ID       101          3 29809.4        5.8         503.4
    9254    229.511156  IL      1223          6 32241.6        9.6         455.3
    9915    186.787520  IN         0          9 30762.1        0.0         578.8
    10576    56.507024  IA       240          4 28750.9        7.6         490.9
    11237    35.610733  KS       184          3 32316.9        6.3         532.9
    11898   113.151791  KY       306          9 38918.1        6.8         662.1
    12559   107.860112  LA       600          5 33921.0       12.9         582.0
    13220    43.391688  ME       128          2 23909.4        9.6         384.3
    13881   622.261727  MD       294          5 22671.2        4.9         393.3
    14542   884.749922  MA       363          7 32311.0        5.3         558.4
    15203   176.597078  MI       853         13 30694.5        8.5         642.8
    15864    70.469675  MN       373          7 31798.4        6.6         399.3
    16525    63.645625  MS       153          2 33250.5        5.1         656.0
    17186    89.117471  MO       390          7 29286.5        6.4         628.1
    17847     7.298751  MT        93          3 31152.1        8.8         545.5
    18508    25.114922  NE       216          0 29874.8       11.2         407.8
    19169    27.640602  NV       159          2 29431.5        5.2         613.7
    19830   151.500597  NH        87          1 27991.2        6.4         345.2
    20491  1211.317235  NJ       534          3 34320.4        6.0         516.1
    21152    17.273065  NM       192          2 32143.4        9.2         667.5
    21813   414.702494  NY      1041         23 34823.4        5.3         551.8
    22474   213.569492  NC       630          5 33531.0        6.1         446.7
    23135    11.015710  ND        73          1 37904.8        9.6         463.8
    23796   286.079885  OH      1002          6 29216.6        8.6         549.7
    24457    57.482602  OK       291          5 32863.5        7.4         654.6
    25118    43.658854  OR       267          3 23078.6        6.4         386.5
    25779   286.234554  PA       743         15 27668.6        5.8         579.5
    26440           NA  PR       323          2 35658.5       10.1         287.7
    27101  1022.650577  RI        75          1 43677.3        7.1         485.6
    27762   169.111176  SC       284          2 36200.1        5.6         603.5
    28423    11.637449  SD        92          0 31797.1       10.4         502.4
    29084   164.174127  TN       485          4 36594.0        7.2         675.5
    29745   109.860468  TX      1358         11 29469.7        4.7         479.1
    30406    38.458215  UT       152          1 34579.0        4.8         263.6
    31067    67.943995  VT        35          1 24460.8        5.6         259.1
    31728   215.736304  VA       389          1 26992.0        4.6         427.3
    32389   113.397227  WA       537          7 25808.4        7.1         348.7
    33050    75.113988  WV       147          1 35756.9        8.1         759.5
    33711   107.328157  WI       587          4 34652.1       10.1         432.2
    34372     5.950611  WY        23          1 32268.0        4.0         572.6
          newdeathsper100k naive_CFR
    661                0.1      1.95
    1322               0.0      0.82
    1983               0.1      1.99
    2644               0.0      2.01
    3305               0.1      1.21
    3966               0.1      1.26
    4627               0.1      1.67
    5288               0.1      1.51
    5949               0.0      1.07
    6610               0.2      1.81
    7271               0.0      2.07
    7932               0.1      0.86
    8593               0.2      1.69
    9254               0.0      1.41
    9915               0.1      1.88
    10576              0.1      1.71
    11237              0.1      1.65
    11898              0.2      1.70
    12559              0.1      1.72
    13220              0.1      1.61
    13881              0.1      1.73
    14542              0.1      1.73
    15203              0.1      2.09
    15864              0.1      1.26
    16525              0.1      1.97
    17186              0.1      2.14
    17847              0.3      1.75
    18508              0.0      1.36
    19169              0.1      2.09
    19830              0.1      1.23
    20491              0.0      1.50
    21152              0.1      2.08
    21813              0.1      1.58
    22474              0.0      1.33
    23135              0.1      1.22
    23796              0.1      1.88
    24457              0.1      1.99
    25118              0.1      1.67
    25779              0.1      2.09
    26440              0.1      0.81
    27101              0.1      1.11
    27762              0.0      1.67
    28423              0.0      1.58
    29084              0.1      1.85
    29745              0.0      1.63
    30406              0.0      0.76
    31067              0.2      1.06
    31728              0.0      1.58
    32389              0.1      1.35
    33050              0.1      2.12
    33711              0.1      1.25
    34372              0.2      1.77
    # pop_density vs. cases: bubble scatter, one marker per state, colored by
    # state and sized by population. Hovering flags District of Columbia as
    # the extreme pop_density outlier (it is filtered out in the next chunk).
    cv_states_today %>% 
      plot_ly(x = ~pop_density, y = ~cases, 
              type = "scatter", mode = 'markers', color = ~state,
              size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
    Warning: Ignoring 1 observations
    Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    
    Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    # filter out "District of Columbia": its pop_density (~11,490) dwarfs
    # every state and compresses the rest of the x-axis
    cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
    
    # pop_density vs. cases after filtering
    cv_states_today_filter %>% 
      plot_ly(x = ~pop_density, y = ~cases, 
              type = "scatter", mode = 'markers', color = ~state,
              size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
    Warning: Ignoring 1 observations
    
    Warning: n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    
    Warning: n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    # pop_density vs. deathsper100k (still excluding DC), same bubble styling
    cv_states_today_filter %>% 
      plot_ly(x = ~pop_density, y = ~deathsper100k,
              type = "scatter", mode = 'markers', color = ~state,
              size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
    Warning: Ignoring 1 observations
    
    Warning: n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    
    Warning: n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    # Same deaths-per-100k vs. density plot, now with custom hover text
    # (state, cases per 100k, deaths per 100k on separate <br> lines),
    # axis/chart titles, and hovermode = "compare".
    cv_states_today_filter %>%
      plot_ly(x = ~pop_density, y = ~deathsper100k,
              type = "scatter", mode = 'markers', color = ~state,
              size = ~population, sizes = c(5, 70),
              marker = list(sizemode = 'diameter', opacity = 0.5),
              hoverinfo = 'text',
              text = ~paste(paste0(state, ":"),
                            paste0(" Cases per 100k: ", per100k),
                            paste0(" Deaths per 100k: ", deathsper100k),
                            sep = "<br>")) %>%
      layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
             yaxis = list(title = "Deaths per 100k"),
             xaxis = list(title = "Population Density"),
             hovermode = "compare")
    Warning: Ignoring 1 observations
    
    Warning: n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors
    
    Warning: n too large, allowed maximum for palette Set2 is 8
    Returning the palette you asked for with that many colors

7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()

  • For pop_density vs. newdeathsper100k create a chart with the same variables using ggplotly()

  • Explore the pattern between x and y using geom_smooth()

    • Explain what you see. Do you think pop_density is a correlate of newdeathsper100k? No, I don’t think there is a relationship.
### FINISH CODE HERE
# pop_density vs. deathsper100k with a linear trend line, explored
# interactively. The original first attempt referenced an undefined
# `cv_states_today_scatter` data frame and left a `_______` placeholder
# (a syntax error); it is replaced by the single completed plot below.
p <- ggplot(cv_states_today_filter,
            aes(x = pop_density, y = deathsper100k, size = population)) +
  geom_point() +
  # `linewidth` replaces the deprecated `size` aesthetic for lines
  # (ggplot2 >= 3.4.0), silencing the deprecation warning
  geom_smooth(method = "lm", se = FALSE, color = "red",
              linewidth = 1.2, linetype = "dashed")
ggplotly(p)

8. Multiple line chart

  • Create a line chart of the naive_CFR for all states over time using plot_ly()

    • Use the zoom and pan tools to inspect the naive_CFR for the states that had an increase in September. How have they changed over time?
  • Create one more line chart, for Florida only, which shows new_cases and new_deaths together in one plot. Hint: use add_lines()

    • Use hoverinfo to “eyeball” the approximate peak of deaths and peak of cases. What is the time delay between the peak of cases and the peak of deaths?
### FINISH CODE HERE
# Line chart of the naive CFR over time, one line per state
cv_states %>%
  plot_ly(x = ~date, y = ~naive_CFR, color = ~state,
          type = "scatter", mode = "lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths as two traces on
# the same axes (add_lines() already draws a line trace, so no extra
# type/mode arguments are needed on the second layer)
cv_states %>%
  dplyr::filter(state == "Florida") %>%
  plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
  add_lines(x = ~date, y = ~new_deaths)

9. Heatmaps

Create a heatmap to visualize new_cases for each state on each date greater than June 1st, 2021

  • Start by mapping selected features in the dataframe into a matrix using the tidyr package function pivot_wider(), naming the rows and columns, as done in the lecture notes

  • Use plot_ly() to create a heatmap out of this matrix. Which states stand out?

  • Repeat with newper100k variable. Now which states stand out?

  • Create a second heatmap in which the pattern of new_cases for each state over time becomes more clear by filtering to only look at dates every two weeks

### FINISH CODE HERE
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)
# Repeat with newper100k (new cases per 100k population), which removes the
# population-size effect so smaller states become visible.
# dplyr:: prefixes added for the same masking reason as above: data.table and
# plotly are attached, so bare select()/filter() are ambiguous.
cv_states_mat <- cv_states %>%
  dplyr::select(state, date, newper100k) %>%
  dplyr::filter(date > as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(
  pivot_wider(cv_states_mat, names_from = state, values_from = newper100k)
)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

plot_ly(x = colnames(cv_states_mat2), y = rownames(cv_states_mat2),
        z = ~cv_states_mat2,
        type = "heatmap",
        showscale = TRUE)
# Create a second heatmap after filtering to only include dates every other
# week, which de-clutters the x-axis and makes each state's trend readable.
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by = "2 weeks")

# Note: the original bare filter() only worked because plotly re-exports
# dplyr::filter and masks stats::filter; qualify explicitly so the code does
# not depend on package attach order.
cv_states_mat <- cv_states %>%
  dplyr::select(state, date, newper100k) %>%
  dplyr::filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(
  pivot_wider(cv_states_mat, names_from = state, values_from = newper100k)
)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x = colnames(cv_states_mat2), y = rownames(cv_states_mat2),
        z = ~cv_states_mat2,
        type = "heatmap",
        showscale = TRUE)

10. Map

  • Create a map to visualize the naive_CFR by state on October 15, 2021

  • Compare with a map visualizing the naive_CFR by state on most recent date

  • Plot the two maps together using subplot(). Make sure the shading is for the same range of values (google is your friend for this)

  • Describe the difference in the pattern of the CFR.

The maps show higher cases per 100k in the northern United States and Alaska on 2021-10-15, whereas by the most recent date (2023-11-10) the rates are lower and more uniform across states.

### For specified date

pick.date <- "2021-10-15"

# Pull the columns needed for the chosen date and key rows by the two-letter
# state abbreviation (plot_geo with locationmode = 'USA-states' matches
# `locations` against those codes).
cv_per100 <- cv_states %>%
  dplyr::filter(date == pick.date) %>%
  dplyr::select(state, abb, newper100k, cases, deaths)
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Build the hover text displayed for each state
cv_per100$hover <- paste(
  cv_per100$state_name, '<br>',
  "Cases per 100k: ", cv_per100$newper100k, '<br>',
  "Cases: ", cv_per100$cases, '<br>',
  "Deaths: ", cv_per100$deaths
)

# Projection/styling shared by both choropleths
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Upper bound of the shared color scale so the two maps are comparable
shadeLimit <- 125

# Choropleth of cases per 100k on the chosen date
# NOTE(review): section 10 asks for naive_CFR but this plots newper100k —
# confirm which variable is intended.
fig_pick.date <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  ) %>%
  colorbar(title = paste0("Cases per 100k: ", pick.date),
           limits = c(0, shadeLimit)) %>%
  layout(
    title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
    geo = set_map_details
  )

#############
### Map for today's date

# Same preparation as the pick.date map, but on the latest snapshot
# (cv_states_today) — rows keyed by two-letter state abbreviation.
cv_per100 <- cv_states_today %>%
  dplyr::select(state, abb, newper100k, cases, deaths)
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Hover text for each state
cv_per100$hover <- paste(
  cv_per100$state_name, '<br>',
  "Cases per 100k: ", cv_per100$newper100k, '<br>',
  "Cases: ", cv_per100$cases, '<br>',
  "Deaths: ", cv_per100$deaths
)

# Projection/styling (identical to the first map)
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Choropleth for the most recent date; reuses shadeLimit so both maps share
# the same color range
fig_Today <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  ) %>%
  colorbar(title = paste0("Cases per 100k: ", Sys.Date()),
           limits = c(0, shadeLimit)) %>%
  layout(
    title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
    geo = set_map_details
  )


### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)

PM566: Introduction to Health Data Science - PM 566 (Fall 2023)

University of Southern California

Department of Population and Public Health Sciences

Kelly Street

kelly.street@usc.edu